Dual CRISPR Screen Analysis

Construct Filter

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. Fill in the values for your analysis for each of the variables in the Input Parameters section
  4. Click Cell > Run All

Input Parameters


In [ ]:
g_num_processors = 3
g_trimmed_fastqs_dir = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/processed/runTemp"
g_filtered_fastas_dir = ""
g_min_trimmed_grna_len = 19
g_max_trimmed_grna_len = 21
g_len_of_seq_to_match = 19
g_code_location = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python"

CCBB Library Imports


In [ ]:
import sys
sys.path.append(g_code_location)

Automated Set-Up


In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
    description_list =  ["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list]
    return "".join(description_list)

In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
g_filtered_fastas_dir = check_or_set(g_filtered_fastas_dir, g_trimmed_fastqs_dir)
print(describe_var_list(['g_filtered_fastas_dir']))

In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
verify_or_make_dir(g_filtered_fastas_dir)

Info Logging Pass-Through


In [ ]:
from ccbbucsd.utilities.notebook_logging import set_stdout_info_logger
set_stdout_info_logger()

Construct Filtering Functions


In [ ]:
import enum

In [ ]:
# %load -s TrimType,get_trimmed_suffix /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/scaffold_trim.py
class TrimType(enum.Enum):
    FIVE = "5"
    THREE = "3"
    FIVE_THREE = "53"

def get_trimmed_suffix(trimtype):
    return "_trimmed{0}.fastq".format(trimtype.value)

In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/count_filterer.py
# standard libraries
import logging

# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import trim_seq
from ccbbucsd.utilities.basic_fastq import FastqHandler, paired_fastq_generator
from ccbbucsd.utilities.files_and_paths import transform_path

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"


def get_filtered_file_suffix():
    return "_len_filtered.fastq"


def filter_pair_by_len(min_len, max_len, retain_len, output_dir, fw_fastq_fp, rv_fastq_fp):
    fw_fastq_handler = FastqHandler(fw_fastq_fp)
    rv_fastq_handler = FastqHandler(rv_fastq_fp)
    fw_out_handle, rv_out_handle = _open_output_file_pair(fw_fastq_fp, rv_fastq_fp, output_dir)
    counters = {"num_pairs": 0, "num_pairs_passing": 0}

    filtered_fastq_records = _filtered_fastq_generator(fw_fastq_handler, rv_fastq_handler, min_len, max_len, retain_len,
                                                       counters)
    for fw_record, rv_record in filtered_fastq_records:
        fw_out_handle.writelines(fw_record.lines)
        rv_out_handle.writelines(rv_record.lines)

    fw_out_handle.close()
    rv_out_handle.close()
    return _summarize_counts(counters)


def _filtered_fastq_generator(fw_fastq_handler, rv_fastq_handler, min_len, max_len, retain_len, counters):
    paired_fastq_records = paired_fastq_generator(fw_fastq_handler, rv_fastq_handler, True)
    for curr_pair_fastq_records in paired_fastq_records:
        counters["num_pairs"] += 1
        _report_progress(counters["num_pairs"])

        fw_record = curr_pair_fastq_records[0]
        fw_passing_seq = _check_and_trim_seq(_get_upper_seq(fw_record), min_len, max_len, retain_len, False)
        if fw_passing_seq is not None:
            rv_record = curr_pair_fastq_records[1]
            rv_passing_seq = _check_and_trim_seq(_get_upper_seq(rv_record), min_len, max_len, retain_len, True)
            if rv_passing_seq is not None:
                counters["num_pairs_passing"] += 1
                fw_record.sequence = fw_passing_seq
                fw_record.quality = trim_seq(fw_record.quality, retain_len, False)
                rv_record.sequence = rv_passing_seq
                rv_record.quality = trim_seq(rv_record.quality, retain_len, True)
                yield fw_record, rv_record


def _open_output_file_pair(fw_fastq_fp, rv_fastq_fp, output_dir):
    fw_fp = transform_path(fw_fastq_fp, output_dir, get_filtered_file_suffix())
    rv_fp = transform_path(rv_fastq_fp, output_dir, get_filtered_file_suffix())
    fw_handle = open(fw_fp, 'w')
    rv_handle = open(rv_fp, 'w')
    return fw_handle, rv_handle


def _report_progress(num_fastq_pairs):
    if num_fastq_pairs % 100000 == 0:
        logging.debug("On fastq pair number {0}".format(num_fastq_pairs))


def _get_upper_seq(fastq_record):
    return fastq_record.sequence.upper()


def _check_and_trim_seq(input_seq, min_len, max_len, retain_len, retain_5p_end):
    result = None
    seq_len = len(input_seq)
    if seq_len >= min_len and seq_len <= max_len:
        result = trim_seq(input_seq, retain_len, retain_5p_end)
    return result


def _summarize_counts(counts_by_type):
    summary_pieces = []
    sorted_keys = sorted(counts_by_type.keys())  # sort to ensure deterministic output ordering
    for curr_key in sorted_keys:
        curr_value = counts_by_type[curr_key]
        summary_pieces.append("{0}:{1}".format(curr_key, curr_value))
    result = ",".join(summary_pieces)
    return result

In [ ]:
from ccbbucsd.utilities.parallel_process_fastqs import parallel_process_paired_reads, concatenate_parallel_results

g_parallel_results = parallel_process_paired_reads(g_trimmed_fastqs_dir, get_trimmed_suffix(TrimType.FIVE_THREE), 
                                                   g_num_processors, filter_pair_by_len, 
                                                   [g_min_trimmed_grna_len, g_max_trimmed_grna_len, 
                                                    g_len_of_seq_to_match, g_filtered_fastas_dir])

In [ ]:
print(concatenate_parallel_results(g_parallel_results))